{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import sys,random\n",
    "import pickle\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('train_3000_featurescore'):\n",
    "    os.mkdir('train_3000_featurescore')\n",
    "if not os.path.exists('train_3000_model'):\n",
    "    os.mkdir('train_3000_model')\n",
    "if not os.path.exists('train_3000_preds'):\n",
    "    os.mkdir('train_3000_preds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_x_d_cols = pd.read_csv('./rank_d_feature_score.csv')\n",
    "train_x_d_cols = list(train_x_d_cols.iloc[10:810].feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正负样本不均衡 {0.0: 2639, 1.0: 361}\n",
      "CPU times: user 20.9 s, sys: 2.11 s, total: 23 s\n",
      "Wall time: 23 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#use rank_d  and rank_nd feature\n",
    "#load data\n",
    "train_x_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id'])\n",
    "train_x_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])\n",
    "train_x_int = pd.read_csv('../../preprocess_data/train_x_int.csv').drop(columns=['id','tag']) # valid中无‘tag’\n",
    "train_x_d = pd.read_csv('../../preprocess_data/train_x_float_rank_d.csv',usecols=train_x_d_cols)\n",
    "train_x_nd = pd.read_csv('../../preprocess_data/train_x_float_rank_nd.csv').drop(columns=['id'])\n",
    "\n",
    "\n",
    "train_x = pd.concat([train_x_date,train_x_null,train_x_int,train_x_d,train_x_nd],axis=1,ignore_index=True,copy=False)\n",
    "train_x = train_x.iloc[30465:]\n",
    "train_y = pd.read_csv('../../preprocess_data/train_y_33465.csv')\n",
    "train_y = train_y.iloc[30465:]\n",
    "print('正负样本不均衡',train_y.label.value_counts().to_dict())\n",
    "dtrain = xgb.DMatrix(train_x.values, label=train_y.values)\n",
    "\n",
    "unlabel_date = pd.read_csv('../../preprocess_data/unlabel_x_date.csv')\n",
    "unlabel_null = pd.read_csv('../../preprocess_data/unlabel_x_null.csv').drop(columns=['id'])\n",
    "unlabel_int = pd.read_csv('../../preprocess_data/unlabel_x_int.csv').drop(columns=['id','tag'])\n",
    "unlabel_d = pd.read_csv('../../preprocess_data/unlabel_x_float_rank_d.csv',usecols=train_x_d_cols)\n",
    "unlabel_nd = pd.read_csv('../../preprocess_data/unlabel_x_float_rank_nd.csv').drop(columns=['id'])\n",
    "unlabel = pd.concat([unlabel_date,unlabel_null,unlabel_int,unlabel_d,unlabel_nd],axis=1,ignore_index=True,copy=False)\n",
    "# pd.concat后特征名没有了，变成数字了\n",
    "test_id = unlabel.iloc[:,0].values\n",
    "test = unlabel.drop(0,axis=1)\n",
    "dtest = xgb.DMatrix(test.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):\n",
    "    if max_depth==6:\n",
    "        num_boost_round = 600\n",
    "    elif max_depth==7:\n",
    "        num_boost_round = 500\n",
    "    elif max_depth==8:\n",
    "        num_boost_round = 350\n",
    "    \n",
    "    params={\n",
    "    'booster':'gbtree',\n",
    "    'objective': 'binary:logistic',\n",
    "    'early_stopping_rounds':100,\n",
    "    'scale_pos_weight': float(len(train_y)-np.sum(train_y.values))/float(np.sum(train_y.values)),  # 负例样本除以正例样本\n",
    "    'eval_metric': 'auc',\n",
    "    'gamma':gamma,\n",
    "    'max_depth':max_depth,\n",
    "    'lambda':lambd,\n",
    "    'subsample':subsample,\n",
    "    'colsample_bytree':colsample_bytree,\n",
    "    'min_child_weight':min_child_weight, \n",
    "    'eta': 0.04,\n",
    "    'seed':random_seed,\n",
    "    'nthread':16\n",
    "        }\n",
    "    watchlist  = [(dtrain,'train')]\n",
    "    model = xgb.train(params,dtrain,num_boost_round=num_boost_round,evals=watchlist)\n",
    "    model.save_model('./train_3000_model/xgb{0}.model'.format(iteration))\n",
    "    \n",
    "    #predict test set\n",
    "    test_y = model.predict(dtest)\n",
    "    test_result = pd.DataFrame(test_id,columns=[\"id\"])\n",
    "    test_result['score'] = test_y\n",
    "    test_result.to_csv(\"./train_3000_preds/xgb{0}.csv\".format(iteration),index=None,encoding='utf-8')\n",
    "    \n",
    "    #get feature score\n",
    "    feature_score = model.get_fscore()\n",
    "    feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)\n",
    "    fs = []\n",
    "    for (key,value) in feature_score:\n",
    "        fs.append(\"{0},{1}\\n\".format(key,value))\n",
    "    \n",
    "    with open('./train_3000_featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:\n",
    "        f.writelines(\"feature,score\\n\")\n",
    "        f.writelines(fs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "random_seed = list(range(0,10000,100))\n",
    "gamma = [i/1000.0 for i in range(100,200,2)]\n",
    "max_depth = [6,7,8]\n",
    "lambd = list(range(200,400,2))\n",
    "subsample = [i/1000.0 for i in range(600,700,2)]\n",
    "colsample_bytree = [i/1000.0 for i in range(250,350,2)]\n",
    "min_child_weight = [i/1000.0 for i in range(200,300,2)]\n",
    "random.shuffle(random_seed)\n",
    "random.shuffle(gamma)\n",
    "random.shuffle(max_depth)\n",
    "random.shuffle(lambd)\n",
    "random.shuffle(subsample)\n",
    "random.shuffle(colsample_bytree)\n",
    "random.shuffle(min_child_weight)\n",
    "\n",
    "with open('params.pkl','wb') as f:\n",
    "    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)\n",
    "\n",
    "#to reproduce my result, uncomment following lines\n",
    "\"\"\"\n",
    "with open('params_for_reproducing.pkl','rb') as f:\n",
    "    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    \n",
    "\"\"\"\n",
    "\n",
    "for i in range(100):\n",
    "    pipeline(dtrain,dtest,test_id,i,random_seed[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 整合预测结果\n",
    "files = os.listdir('train_3000_preds')\n",
    "pred = pd.read_csv('./train_3000_preds/%s' %files[0])  \n",
    "id = pred.id\n",
    "score = pred.score\n",
    "for file in files[1:]:\n",
    "    score += pd.read_csv('./train_3000_preds/%s' %files[0]).score\n",
    "score /= len(files)\n",
    "avg_pred = pd.DataFrame(id,columns=['id'])\n",
    "avg_pred['score'] = score\n",
    "\n",
    "avg_pred.to_csv('./train_3000_avg_pred.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
